################################################################################
## LOAD PACKAGES
################################################################################

library(tidyverse)        # A collection of R packages for data manipulation and visualization (includes ggplot2, dplyr, etc.)
library(dplyr)            # For data manipulation (e.g., filtering, grouping, summarizing)
library(forcats)          # For working with factors (e.g., reordering, modifying levels)
library(stringr)          # For string manipulation (e.g., pattern matching, cleaning)
library(janitor)          # For data cleaning (e.g., cleaning column names)
library(data.table)       # For fast and efficient data manipulation, especially with large datasets
library(lubridate)        # For date-time manipulation and conversion
library(ggtext)           # For enhanced text rendering in ggplot (e.g., HTML/Markdown support)
library(ggimage)          # For adding images into ggplot2 plots (e.g., logos or icons)
library(grid)             # For customizing and combining graphical objects
library(cowplot)          # For combining ggplot figures and extracting legends
library(scales)           # For custom scales in ggplot2 (e.g., axis formatting)
library(factoextra)       # For visualizing multivariate analysis (PCA, clustering, etc.)
library(corrplot)         # For visualizing correlation matrices
library(broom)            # For converting model outputs into tidy data frames
library(psych)            # For psychometric and multivariate analysis (e.g., factor analysis, reliability)
library(EFAtools)         # For exploratory factor analysis and related tools
library(EFA.dimensions)   # For dimension estimation in exploratory factor analysis
library(GPArotation)      # For factor rotation methods (e.g., Varimax, Promax)
library(lavaan)           # For structural equation modeling (SEM), including CFA
library(semPlot)          # For visualizing SEM models
library(semptools)        # For customizing SEM diagrams and enhancing semPlot visuals
#remotes::install_version("mirt", version = "1.43") # Previous version of mirt needed
library(mirt)             # For multidimensional item response theory (IRT) modeling
library(forecast)         # For time series analysis and forecasting models (e.g., ARIMA)
library(vegan)            # For ecological and multivariate data analysis (e.g., ordination, diversity indices)
library(readxl)           # For reading Excel files (.xlsx and .xls formats)
library(writexl)          # For writing Excel files (.xlsx format)
#library(xlsx)             # For reading and writing older Excel files (.xls/.xlsx)
library(xtable)           # For creating LaTeX or HTML tables from data frames
library(gtools)           # Miscellaneous R programming tools (e.g., mixedSort, permutations)

################################################################################
## SET WORKING DIRECTORY
################################################################################

getwd()

#setwd() # Set working directory here

################################################################################
## LOAD DATA: CITIZENS' OPINIONS
################################################################################

# Load data

votesmart.ireland.df <- read.csv("votesmart_ireland_df.csv")

################################################################################
## FIND OUTLIERS (DURATION)
################################################################################

# Remove sessions lasting 70 seconds or less
votesmart.ireland.df <- votesmart.ireland.df %>% filter(duration > 70000)

# Check summary description of duration before removing outliers
votesmart.ireland.duration.w.outliers <- summary(votesmart.ireland.df$duration)

# Calculate Q1 (25th percentile) and Q3 (75th percentile) of duration
votesmart.ireland.q1 <- quantile(votesmart.ireland.df$duration, 0.25)
votesmart.ireland.q3 <- quantile(votesmart.ireland.df$duration, 0.75)

# Calculate the interquartile range (IQR)
irq.ireland <- votesmart.ireland.q3 - votesmart.ireland.q1

# Define lower and upper bounds for outliers (using 1.5 * IQR method)
lower.bound.ireland <- votesmart.ireland.q1 - 1.5 * irq.ireland
upper.bound.ireland <- votesmart.ireland.q3 + 1.5 * irq.ireland

# Identify outliers by checking if the duration is outside the defined bounds
votesmart.ireland.df$outlier <- votesmart.ireland.df$duration < lower.bound.ireland | votesmart.ireland.df$duration > upper.bound.ireland

# Show a summary of the outlier status (TRUE for outliers, FALSE for non-outliers)
table(votesmart.ireland.df$outlier)

# Drop rows that are outliers (outlier == FALSE)
votesmart.ireland.df <- votesmart.ireland.df %>% filter(outlier == "FALSE") 

# Check summary description of duration after removing outliers
votesmart.ireland.duration.wo.outliers <- summary(votesmart.ireland.df$duration)

# Remove sessions lasting less than 180 seconds
votesmart.ireland.df <- votesmart.ireland.df %>% filter(duration >= 180000)

################################################################################
## PERCENTAGE OF BOOSTED ITEMS
################################################################################

# Load labels
labels.df <- read_excel("labels_ireland.xlsx", col_names = TRUE)

# Get columns ending in "_opinion"
boosted.cols <- grep("_boosted$", names(votesmart.ireland.df), value = TRUE)

# Subset the dataframe to only _opinion columns
votesmart.ireland.boosted.df <- votesmart.ireland.df[, boosted.cols]

# Reshape the boosted dataframe
votesmart.ireland.boosted.df.long <- votesmart.ireland.boosted.df %>%
  pivot_longer(
    cols = ends_with("_boosted"),
    names_to = "code_boosted",
    values_to = "boosted"
  ) %>%
  mutate(
    code = str_remove(code_boosted, "_boosted"),
    boosted_binary = ifelse(is.na(boosted), 0, as.integer(boosted))
  )

# Join with labels to get category
votesmart.ireland.boosted.df.merged <- votesmart.ireland.boosted.df.long %>%
  left_join(labels.df, by = "code")

# Calculate percentage of TRUE responses by category
votesmart.ireland.boosted.df.summary <- votesmart.ireland.boosted.df.merged %>%
  mutate(category = str_to_title(str_to_lower(abbr))) %>%  # Convert to lowercase, then title case
  group_by(category) %>%
  summarise(
    percent_true = mean(boosted_binary) * 100,
    n = n()
  ) %>%
  arrange(desc(percent_true))

# Create plot showing percentage of salient answers per issue category
# Save the plot as a high-resolution PNG
png("figs/ireland_boosted_by_category.png", units = "in", width = 7, height = 5, res = 300)

# Reorder categories by descending percent_true
votesmart.ireland.boosted.df.summary$category <- fct_reorder(
  votesmart.ireland.boosted.df.summary$category,
  votesmart.ireland.boosted.df.summary$percent_true,
  .desc = TRUE
)

# Create the plot
ggplot(votesmart.ireland.boosted.df.summary, aes(x = category, y = percent_true, fill = category)) +
  geom_bar(stat = "identity", color = "black", size = 0.2) +
  scale_fill_manual(values = c(
    "Crime" = "#42d4f4",
    "Culture" = "#f032e6",
    "Economy" = "#800000",
    "Education" = "#9A6324",
    "Environment" = "#3cb44b",
    "Ethics" = "#f58231",
    "Foreign" = "#ffe119",
    "Housing" = "#000075",
    "Immigration" = "#e6194B",
    "Mobility" = "#4363d8",
    "State" = "#911eb4",
    "Welfare" = "#bfef45"
  )) +
  theme_minimal() +
  labs(
    x = "Issue Category",
    y = "% Boosted Answers"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 10),
    axis.text.y = element_text(size = 10),
    axis.title.x = element_blank(),
    axis.title.y = element_text(size = 11, margin = margin(r = 8)),
    plot.title = element_blank(),
    legend.position = "none"
  ) +
  geom_text(aes(label = sprintf("%.2f", percent_true)), vjust = -0.5, size = 3)

# Close the PNG device
dev.off()

# Total number of "TRUE"
votesmart.ireland.boosted <- sum(votesmart.ireland.boosted.df == "TRUE", na.rm = TRUE)

# Total number of responses in _boosted columns
votesmart.ireland.boosted.total <- nrow(votesmart.ireland.boosted.df) * ncol(votesmart.ireland.boosted.df)

# Percentages
votesmart.ireland.pct.boosted <- (votesmart.ireland.boosted / votesmart.ireland.boosted.total) * 100

# Print
votesmart.ireland.pct.boosted

# Average number of boosted statements per respondent
votesmart.ireland.avg.boosted <- mean(rowSums(votesmart.ireland.boosted.df == TRUE, na.rm = TRUE))
votesmart.ireland.avg.boosted

################################################################################
## PERCENTAGE OF EACH ANSWER CATEGORY
################################################################################

# Get columns ending in "_opinion"
opinion.cols <- grep("_opinion$", names(votesmart.ireland.df), value = TRUE)

# Subset the dataframe to only _opinion columns
votesmart.ireland.opinion.df <- votesmart.ireland.df[, opinion.cols]

# Total number of "skipped", "agree", and "disagree"
votesmart.ireland.skipped <- sum(votesmart.ireland.opinion.df == "skipped", na.rm = TRUE)
votesmart.ireland.agree <- sum(votesmart.ireland.opinion.df == "agree", na.rm = TRUE)
votesmart.ireland.disagree <- sum(votesmart.ireland.opinion.df == "disagree", na.rm = TRUE)

# Total number of responses in _opinion columns
votesmart.ireland.total <- nrow(votesmart.ireland.opinion.df) * ncol(votesmart.ireland.opinion.df)

# Percentages
votesmart.ireland.pct.skipped <- (votesmart.ireland.skipped / votesmart.ireland.total) * 100
votesmart.ireland.pct.agree <- (votesmart.ireland.agree / votesmart.ireland.total) * 100
votesmart.ireland.pct.disagree <- (votesmart.ireland.disagree / votesmart.ireland.total) * 100

# Print
votesmart.ireland.pct.skipped
votesmart.ireland.pct.agree
votesmart.ireland.pct.disagree

# Average number of boosted statements per respondent
votesmart.ireland.avg.skipped <- mean(rowSums(votesmart.ireland.opinion.df == "skipped"))
votesmart.ireland.avg.skipped

################################################################################
## PERCENTAGE OF SKIPPED ITEMS: GRAPH
################################################################################

# Load labels to rename the columns and rows of the correlation matrix
labels.df <- read_excel("labels_ireland.xlsx", col_names = TRUE)

# Step 1: Calculate proportion of "skipped"
skipped.proportions <- votesmart.ireland.df %>%
  select(all_of(opinion.cols)) %>%
  summarise(across(everything(), ~mean(. == "skipped", na.rm = TRUE))) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "skipped_proportion")

# Step 2: Extract item codes
skipped.proportions <- skipped.proportions %>%
  mutate(code = str_remove(variable, "_opinion"))

# Step 3: Join with labels.df
skipped.proportions <- skipped.proportions %>%
  left_join(labels.df, by = "code")

# Step 4: Format label as "COLORED new_column: rest of label"
skipped.proportions <- skipped.proportions %>%
  mutate(
    label_colored = paste0(
      "<span style='color:", colors, ";'><b>", toupper(abbr), "</b></span>. ", label
    )
  )

# Step 5: Create plot and save to file
png("figs/ireland_skipped_plot.png", units = "in", width = 10, height = 8, res = 300)

ggplot(skipped.proportions, aes(y = reorder(label_colored, -skipped_proportion), x = skipped_proportion)) +
  geom_col(fill = "grey40") +
  theme_minimal() +
  labs(
    y = NULL,                        # Remove y-axis title
    x = "Proportion of skipped items"     # Keep x-axis title
    # No plot title
  ) +
  theme(
    axis.text.y = element_markdown(size = 10.5),
    axis.text.x = element_text(size = 13),
    axis.title.x = element_text(margin = margin(t = 15, b = 5), size = 14),  # Larger margin above/below x axis title
    plot.title = element_blank()   # Remove plot title if any
  )

dev.off()

################################################################################
## CREATE PERSON WEIGHT
################################################################################

# Step 1: Identify opinion and boosted columns
votesmart.ireland.df.opinion.cols <- grep("_opinion$", names(votesmart.ireland.df), value = TRUE)
votesmart.ireland.df.boosted.cols <- gsub("_opinion$", "_boosted", votesmart.ireland.df.opinion.cols)

# Recode opinion values ("agree" = 1, "disagree" = 0, "skipped" = NA)
votesmart.ireland.df <- votesmart.ireland.df %>%
  mutate(across(ends_with("_opinion"), ~ recode(.x, "agree" = "1", "disagree" = "0", "skipped" = "")))

# Compute person-level weights
votesmart.ireland.boosted.matrix <- votesmart.ireland.df[, votesmart.ireland.df.boosted.cols]
votesmart.ireland.boosted.matrix[] <- lapply(votesmart.ireland.boosted.matrix, as.logical)
votesmart.ireland.boosted.matrix <- as.data.frame(lapply(votesmart.ireland.boosted.matrix, unlist))
votesmart.ireland.weight.matrix <- ifelse(as.matrix(votesmart.ireland.boosted.matrix), 2, 1)
person_weights <- rowSums(votesmart.ireland.weight.matrix, na.rm = TRUE)
person_weights <- person_weights / mean(person_weights)

votesmart.ireland.df$person_weights <- person_weights

################################################################################
## CREATE MAIN DATAFRAME FOR CITIZENS' OPINIONS
################################################################################

# Remove columns containing "_boosted" (such as salience-related columns)
votesmart.ireland.df <- select(votesmart.ireland.df, -contains("_boosted"))

# Clean column names by removing "_opinion" from all column names
votesmart.ireland.df <- votesmart.ireland.df %>% 
  rename_with(~ str_remove(., "_opinion"), everything())

# Convert opinion columns to numeric format
votesmart.ireland.df <- votesmart.ireland.df %>% 
  mutate(across(starts_with("ST"), as.numeric))

# Keep only the opinion columns, weights, duration, and date
votesmart.ireland.df <- votesmart.ireland.df %>% 
  select(starts_with('ST'), person_weights, duration, date)

# Remove columns that have only NA values
votesmart.ireland.df <- votesmart.ireland.df[, colSums(is.na(votesmart.ireland.df)) != nrow(votesmart.ireland.df)]

# Keep only complete cases (rows with no missing values)
#votesmart.ireland.df <- votesmart.ireland.df[complete.cases(votesmart.ireland.df), ]

# Create copy of dataframe including weight column
votesmart.ireland.df.wgt <- votesmart.ireland.df %>% 
  select(starts_with('ST'), person_weights)

# Select only the opinion columns for further analysis
votesmart.ireland.df <- votesmart.ireland.df %>% 
  select(starts_with('ST'))

################################################################################
## COMPUTE CORRELATIONS BY ISSUE CATEGORY
################################################################################

general.themes.df <- votesmart.ireland.df
general.themes.df <- na.omit(general.themes.df)

# Load the Excel file containing the 'code' and 'abbr' columns
themes.data <- read_excel("labels_ireland.xlsx")  

# themes.data has columns 'code' (e.g., ST1, ST2, ...) and 'abbr' (e.g., ECONOMY, EDUCATION)
# Create a mapping of the code to the abbr (theme)
ireland.theme.map <- setNames(themes.data$abbr, themes.data$code)

# R dataframe 'general.themes.df' contains columns like ST1, ST2, ..., ST348
# Rename the columns in the dataframe based on the ireland.theme.map
colnames(general.themes.df) <- sapply(colnames(general.themes.df), function(colname) {
  # Check if the column name is in the ireland.theme.map and modify accordingly
  if (colname %in% names(ireland.theme.map)) {
    return(paste(ireland.theme.map[colname], colname, sep = "_"))
  } else {
    return(colname)  # In case there is no theme for that particular code
  }
})

# Step 1: Identify unique prefixes (everything before "_")
ireland.prefixes <- unique(substring(names(general.themes.df), 1, regexpr("_", names(general.themes.df)) - 1))

# Step 2: Compute tetrachoric correlations for columns with the same prefix
for (prefix in ireland.prefixes) {
  # Step 2a: Subset the dataframe to include only columns with the same prefix
  ireland.cols.with.prefix <- grep(paste0("^", prefix, "_"), names(general.themes.df), value = TRUE)
  
  # If there are fewer than 2 columns, skip the calculation for this prefix
  if (length(ireland.cols.with.prefix) < 2) {
    cat("\nSkipping prefix", prefix, "because it has fewer than 2 columns.\n")
    next  # Skip to the next prefix
  }
  
  # Subset the dataframe
  general.themes.df.subset <- general.themes.df[, ireland.cols.with.prefix]
  
  # Step 2b: Compute the tetrachoric correlation for the subset of columns
  ireland.tetra.cor <- tetrachoric(general.themes.df.subset)
  
  # Get the correlation matrix (rho)
  ireland.corr.matrix <- ireland.tetra.cor$rho
  
  # Step 3: Compute the absolute correlation matrix
  ireland.abs.corr.matrix <- abs(ireland.corr.matrix)  # Get the absolute values of the correlation matrix
  
  # Remove diagonal values (1s)
  diag(ireland.abs.corr.matrix) <- NA
  
  # Step 4: Compute the average absolute correlation for this set of columns
  ireland.avg.abs.corr.matrix <- mean(ireland.abs.corr.matrix, na.rm = TRUE)
  
  # Step 5: Adjust the average correlation by the number of variables (columns)
  N <- length(ireland.cols.with.prefix)  # Number of variables (columns)
  ireland.avg.adj.corr <- ireland.avg.abs.corr.matrix / (N - 1)  # Adjust by N - 1 (for pairwise correlation)
  
  # Step 6: Apply Fisher Z-transformation to the absolute correlation matrix
  ireland.abs.corr.matrix <- abs(ireland.corr.matrix)  # Take absolute values of correlations first
  ireland.fisher.z.matrix <- 0.5 * log((1 + ireland.abs.corr.matrix) / (1 - ireland.abs.corr.matrix))
  
  # Step 7: Compute the average Fisher Z-transformed correlation (excluding diagonal)
  ireland.fisher.z.matrix[lower.tri(ireland.fisher.z.matrix)] <- NA  # Optional: Set lower triangle to NA to avoid duplicates
  diag(ireland.fisher.z.matrix) <- NA  # Remove diagonal values
  ireland.avg.fisher.z <- mean(ireland.fisher.z.matrix, na.rm = TRUE)
  
  # Step 8: Reverse Fisher Z-transformation to get back to the original correlation scale
  ireland.avg.fisher.corr <- (exp(2 * ireland.avg.fisher.z) - 1) / (exp(2 * ireland.avg.fisher.z) + 1)
  
  # Print the results: Average, Adjusted Average, and Fisher Z-transformed Average correlation
  cat("\nFor prefix", prefix, ":\n")
  cat("  Average absolute correlation: ", ireland.avg.abs.corr.matrix, "\n")
  cat("  Adjusted average absolute correlation: ", ireland.avg.adj.corr, "\n")
  cat("  Fisher Z-transformed average correlation (absolute): ", ireland.avg.fisher.corr, "\n")
}

# Same but save in dataframe
# Initialize list to collect results
ireland.correlation.results <- list()

for (prefix in ireland.prefixes) {
  ireland.cols.with.prefix <- grep(paste0("^", prefix, "_"), names(general.themes.df), value = TRUE)
  
  if (length(ireland.cols.with.prefix) < 2) {
    cat("\nSkipping prefix", prefix, "because it has fewer than 2 columns.\n")
    next
  }
  
  general.themes.df.subset <- general.themes.df[, ireland.cols.with.prefix]
  ireland.tetra.cor <- tetrachoric(general.themes.df.subset)
  ireland.corr.matrix <- ireland.tetra.cor$rho
  
  ireland.abs.corr.matrix <- abs(ireland.corr.matrix)
  diag(ireland.abs.corr.matrix) <- NA
  ireland.avg.abs.corr.matrix <- mean(ireland.abs.corr.matrix, na.rm = TRUE)
  N <- length(ireland.cols.with.prefix)
  ireland.avg.adj.corr <- ireland.avg.abs.corr.matrix / (N - 1)
  
  ireland.fisher.z.matrix <- 0.5 * log((1 + ireland.abs.corr.matrix) / (1 - ireland.abs.corr.matrix))
  ireland.fisher.z.matrix[lower.tri(ireland.fisher.z.matrix)] <- NA
  diag(ireland.fisher.z.matrix) <- NA
  ireland.avg.fisher.z <- mean(ireland.fisher.z.matrix, na.rm = TRUE)
  ireland.avg.fisher.corr <- (exp(2 * ireland.avg.fisher.z) - 1) / (exp(2 * ireland.avg.fisher.z) + 1)
  
  # Append results
  ireland.correlation.results[[length(ireland.correlation.results) + 1]] <- list(
    theme = prefix,
    avg_abs_corr = ireland.avg.abs.corr.matrix,
    adj_avg_abs_corr = ireland.avg.adj.corr,
    fisher_z_corr = ireland.avg.fisher.corr
  )
}

# Convert list to dataframe
ireland.correlations.df <- do.call(rbind, lapply(ireland.correlation.results, as.data.frame))

# Save to Excel
write_xlsx(ireland.correlations.df, "files/ireland_correlations_df.xlsx")

################################################################################
## LOAD DATA: PARTY POSITIONS
################################################################################

# Load the party positions data from an Excel sheet
ireland.positions <- read_excel("positions_ireland.xlsx", sheet = "ireland")

# Remove unecessary columns
ireland.positions <- ireland.positions %>%
  select(
    -matches("^Short description_"), 
    -matches("^Motivation_"), 
    -"constituency-slug", 
    -"constituency", 
    -"partyID"
  )

# Step 1: Rename party values to party1, party2, ...
ireland.positions <- ireland.positions %>%
  mutate(party = factor(party)) %>%
  mutate(party = paste0("party", as.integer(party)))

# Step 2: Reshape to long format and remove duplicates
ireland.positions.long <- ireland.positions %>%
  pivot_longer(
    cols = -party,
    names_to = c(".value", "set"),
    names_pattern = "(StatementID|Statement|Answer)_(\\d+)"
  ) %>%
  select(party, StatementID, Statement, Answer) %>%
  distinct() %>%
  rename(
    SID = StatementID,
    statement = Statement,
    value = Answer
  )

# Reshape data from long format to wide format using pivot_wider
ireland.positions.wide <- ireland.positions.long %>% 
  dplyr::select(-statement) %>%  # Remove the 'statement' column
  pivot_wider(
    names_from = SID,            # Use 'SID' to create new column names
    values_from = value          # Use 'value' to fill the new columns
  )

ireland.positions.wide <- ireland.positions.wide %>%
  mutate(across(everything(), ~replace_na(.x, 99)))

################################################################################
## KEEP COMMON COLUMNS
################################################################################

# Remove columns with constant values
votesmart.ireland.df <- votesmart.ireland.df[, sapply(votesmart.ireland.df, function(x) length(unique(x)) > 1)]
ireland.positions.wide <- ireland.positions.wide[, sapply(ireland.positions.wide, function(x) length(unique(x)) > 1)]

# Remove columns that contain one or more '99' values
ireland.positions.wide <- ireland.positions.wide[, !apply(ireland.positions.wide, 2, function(x) any(x == 99))]

# Find the common column names between the dataframes 'votesmart.ireland.df' and 'ireland.positions.wide'
ireland.common.columns <- intersect(names(votesmart.ireland.df), names(ireland.positions.wide))

# Subset the wide dataframe to keep only the common columns
ireland.positions.wide <- ireland.positions.wide[, ireland.common.columns]
votesmart.ireland.df.combine <- votesmart.ireland.df[, ireland.common.columns]

################################################################################
## CREATE MAIN DATAFRAME FOR PARTY POSITIONS
################################################################################

# Recode values in the position columns: Replace '2' with '0'
ireland.positions.wide <- ireland.positions.wide %>%
  mutate(across(everything(), ~ replace(., . == 2, 0)))

# Convert all opinion columns (of type double) to numeric format
ireland.positions.wide <- ireland.positions.wide %>%
  mutate(across(where(is.double), as.numeric))

################################################################################
## CREATE CORRELATION MATRIX: CITIZENS
################################################################################

# Count the total number of sessions in the data
n.votesmart.ireland.df <- nrow(votesmart.ireland.df.combine)

# Compute Kuder-Richardson Formula 20 (reliability coefficient)
#kr20_value <- validateR::kr20(votesmart.ireland.df.combine)

# Compute tetrachoric correlation for the opinion data
ireland.tetrachoric.corr <- tetrachoric(votesmart.ireland.df.combine)

# Extract the correlation matrix and convert it to absolute values
ireland.cor.matrix  <- abs(ireland.tetrachoric.corr$rho)

# Calculate the average absolute correlation for each row
ireland.avg.abs.corr.matrix <- rowMeans(ireland.cor.matrix)

# Convert the correlation matrix to long format (for easier visualization)
ireland.cor.matrix.long <- reshape2::melt(ireland.cor.matrix)
ireland.cor.matrix.long$Matrix <- "(a) Citizens"  # Label for the first matrix

# Perform hierarchical clustering on the correlation matrix
ireland.hc <- hclust(as.dist(1 - ireland.cor.matrix), method = "complete")

# Get the order of items after hierarchical clustering
ireland.ordered.names <- rownames(ireland.cor.matrix)[ireland.hc$order]

# Reorder the correlation matrix according to the clustering order
ireland.cor.matrix.reordered  <- ireland.cor.matrix[ireland.hc$order, ireland.hc$order]

# Load labels to rename the columns and rows of the correlation matrix
labels.df <- read_excel("labels_ireland.xlsx", col_names = TRUE)

# Clean up the labels data (add new codes and labels)
labels.df <- labels.df %>%
  #mutate(new_column = paste0("T", as.integer(factor(category)))) %>%
  mutate(new_column = paste0(abbr)) %>%
  mutate(
    new_code = paste0(new_column, ". ", code),
    new_label = paste0(new_column, ". ", label)
  ) %>%
  arrange(category)

# Merge the labels with the correlation matrix data
ireland.cor.matrix.long <- merge(ireland.cor.matrix.long, labels.df, by.x = "Var1", by.y = "code")
ireland.cor.matrix.long <- merge(ireland.cor.matrix.long, labels.df, by.x = "Var2", by.y = "code")

# Label the rows and columns of the reordered correlation matrix
labels <- labels.df$label[match(colnames(ireland.cor.matrix.reordered), labels.df$code)]
rownames(ireland.cor.matrix.reordered) <- labels

# Perform hypothesis testing to compute p-values for the correlations
ireland.ptest <- cor.mtest(ireland.cor.matrix.reordered , conf.level = 0.95)

# Calculate the proportion of statistically significant correlations (p-value < 0.05)
ireland.mean.sig <- mean(ireland.ptest$p[upper.tri(ireland.ptest$p)] < 0.05)

# Set diagonal values of the correlation matrix to NA (self-correlations should be ignored)
ireland.cor.matrix.reordered.diag <- ireland.cor.matrix.reordered 
diag(ireland.cor.matrix.reordered.diag) <- NA

# Compute the average of the non-diagonal absolute correlations
ireland.mean.abs.corr <- mean(ireland.cor.matrix.reordered.diag, na.rm = TRUE)

# Flatten the matrix into a vector for easier analysis
ireland.cor.vector <- as.vector(ireland.cor.matrix.reordered.diag)

# Count the number of correlations with absolute value greater than or equal to 0.4 and 0.2
ireland.num.above.0.4 <- sum(abs(ireland.cor.vector) >= 0.4, na.rm = TRUE)
ireland.num.above.0.2 <- sum(abs(ireland.cor.vector) >= 0.2, na.rm = TRUE)

# Calculate the total number of correlations
ireland.num.total <- length(ireland.cor.vector)

# Calculate the percentage of correlations above the thresholds
ireland.percentage.above.0.4 <- (ireland.num.above.0.4 / ireland.num.total) * 100
ireland.percentage.above.0.2 <- (ireland.num.above.0.2 / ireland.num.total) * 100

################################################################################
## CREATE CORRELATION MATRIX: PARTIES
################################################################################

# Apply the Kuder-Richardson Formula 20 to validate the data
#pkr20_value <- validateR::kr20(ireland.positions.wide)

# Compute the tetrachoric correlation matrix
ireland.tetrachoric.corr.p <- tetrachoric(ireland.positions.wide)
ireland.cor.matrix.p  <- ireland.tetrachoric.corr.p$rho

# Take the absolute values of the correlation matrix
ireland.cor.matrix.p  <- abs(ireland.cor.matrix.p)

# Convert the correlation matrix to long format for easier manipulation
ireland.cor.matrix.p.long <- reshape2::melt(abs(ireland.cor.matrix.p))
ireland.cor.matrix.p.long$Matrix <- "(b) Party leaders"  # Label the matrix for visualization

# Merge with 'labels.df' to get descriptive labels for the columns/rows
ireland.cor.matrix.p.long <- merge(ireland.cor.matrix.p.long, labels.df, by.x = "Var1", by.y = "code")
ireland.cor.matrix.p.long <- merge(ireland.cor.matrix.p.long, labels.df, by.x = "Var2", by.y = "code")

# Perform hierarchical clustering on the correlation matrix
ireland.hc.p <- hclust(as.dist(1 - ireland.cor.matrix.p), method = "complete")

# Reorder the correlation matrix based on the hierarchical clustering order
ireland.cor.matrix.p.reordered  <- ireland.cor.matrix.p[ireland.ordered.names, ireland.ordered.names]

# Assign labels to the reordered matrix columns and rows based on 'labels.df'
labels <- labels.df$label[match(colnames(ireland.cor.matrix.p.reordered), labels.df$code)]
rownames(ireland.cor.matrix.p.reordered) <- labels

# Compute the confidence intervals for each pair of items in the correlation matrix
pitemsCols <- 1:ncol(ireland.cor.matrix.p.reordered)
ireland.ptest.p <- cor.mtest(ireland.cor.matrix.p.reordered[ , pitemsCols], conf.level = 0.95)

# Calculate the proportion of statistically significant correlations
ireland.mean.sig.p <- mean(ireland.ptest.p$p[upper.tri(ireland.ptest.p$p)] < 0.05)

# Set diagonal values of the matrix to NA (ignores self-correlations)
ireland.cor.matrix.p.reordered.diag <- ireland.cor.matrix.p.reordered 
diag(ireland.cor.matrix.p.reordered.diag) <- NA

# Compute the average of the non-diagonal absolute correlations
ireland.mean.abs.corr.p <- mean(ireland.cor.matrix.p.reordered.diag, na.rm = TRUE)

# Flatten the matrix into a vector for further analysis
ireland.cor.vector.p <- as.vector(ireland.cor.matrix.p.reordered.diag)

# Count the number of correlations with an absolute value ≥ 0.4
ireland.num.above.0.4.p <- sum(abs(ireland.cor.vector.p) >= 0.4, na.rm = TRUE)

# Count the number of correlations with an absolute value ≥ 0.2
ireland.num.above.0.2.p <- sum(abs(ireland.cor.vector.p) >= 0.2, na.rm = TRUE)

# Calculate the total number of correlations
ireland.num.total.p <- length(ireland.cor.vector.p)

# Calculate the percentage of correlations that are ≥ 0.4 and ≥ 0.2
ireland.percentage.above.0.4.p <- (ireland.num.above.0.4.p / ireland.num.total.p) * 100
ireland.percentage.above.0.2.p <- (ireland.num.above.0.2.p / ireland.num.total.p) * 100

################################################################################
## CORRELATION BETWEEN THE TWO MATRICES
################################################################################

# Extract correlation matrices from tetrachoric objects
ireland.tetrachoric.matrix <- ireland.tetrachoric.corr$rho
ireland.tetrachoric.matrix.p <- ireland.tetrachoric.corr.p$rho

# Helper to flatten lower triangle
flatten_corr <- function(corr_matrix) {
  corr_matrix[lower.tri(corr_matrix)]
}

# 1. SCATTERPLOT OF CORRELATIONS WITH CORRELATION VALUE
ireland.vec.citizens <- flatten_corr(ireland.tetrachoric.matrix)
ireland.vec.parties <- flatten_corr(ireland.tetrachoric.matrix.p)
ireland.vec <- data.frame(Citizen = ireland.vec.citizens, Politician = ireland.vec.parties)

# Compute Pearson correlation
ireland.corr.val <- cor(ireland.vec$Citizen, ireland.vec$Politician, use = "complete.obs")

# Format correlation value to show two decimal places
ireland.corr.val <- sprintf("%.2f", ireland.corr.val)

png("figs/ireland_scatter_plot.png", units = "in", width = 10, height = 8, res = 300)

# Create the plot with improved styling
ireland.scatter.plot <- ggplot(ireland.vec, aes(x = Citizen, y = Politician)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "red", se = FALSE) +
  scale_x_continuous(limits = c(-1, 1), breaks = seq(-1, 1, by = 0.2)) +  # Custom x-axis
  annotate("text", x = min(ireland.vec$Citizen, na.rm = TRUE) - 0.28,  # Move to left
           y = max(ireland.vec$Politician, na.rm = TRUE) + 0.05,  # Move to top
           label = paste0("r = ", ireland.corr.val),
           hjust = 0, vjust = 1, size = 4.5) +
  labs(x = "Citizens' Tetrachoric Correlations",
       y = "Party Leaders' Tetrachoric Correlations") +
  theme_minimal(base_size = 16) +
  theme(
    plot.title = element_blank(),  # remove title
    axis.title.x = element_text(margin = margin(t = 15), size = 16),
    axis.title.y = element_text(margin = margin(r = 15), size = 16),
    axis.text = element_text(size = 13)
  )

ireland.scatter.plot

dev.off()

# 2. DIFFERENCE HEATMAP (clipped to [-1, 1])
par(mfrow = c(1, 1))  # Reset layout
ireland.diff.corr <- ireland.tetrachoric.matrix - ireland.tetrachoric.matrix.p
ireland.diff.corr.clipped <- pmax(pmin(ireland.diff.corr, 1), -1)  # clip values to [-1, 1]

corrplot(ireland.diff.corr.clipped, method = "color", tl.cex = 0.7, mar = c(0, 0, 2, 0))
#title("Citizen - Politician Correlations", line = 1)

################################################################################
## CREATE FACETED PLOT
################################################################################

# Combine the correlation matrices (from citizens and parties) into a single long-format dataframe
ireland.long.combined <- rbind(ireland.cor.matrix.long, ireland.cor.matrix.p.long)

# Select and organize necessary columns for further analysis and visualization
ireland.long.combined <- ireland.long.combined %>%
  select(new_code.x, new_code.y, new_label.x, new_label.y, value, category.x, category.y, Matrix)

# Convert codes to factors for better handling of categorical data
ireland.long.combined$new_code.x <- as.factor(ireland.long.combined$new_code.x)
ireland.long.combined$new_code.y <- as.factor(ireland.long.combined$new_code.y)

# Calculate the average absolute correlation for each row (excluding diagonal)
ireland.long.combined <- ireland.long.combined %>%
  group_by(new_code.x, Matrix) %>%
  mutate(avg_corr = mean(abs(value[new_code.x != new_code.y]), na.rm = TRUE)) %>%
  ungroup()

# Prepare data for labeling the average correlation values at the end of each row
ireland.avg.corr.labels <- ireland.long.combined %>%
  distinct(new_code.x, avg_corr, Matrix) %>%
  mutate(x_position = max(as.numeric(ireland.long.combined$new_code.y)))

# Merge with 'labels.df' to get descriptive labels for the codes
ireland.avg.corr.labels <- ireland.avg.corr.labels %>%
  mutate(new_code.x = as.character(new_code.x)) %>% 
  left_join(labels.df, by = c("new_code.x" = "new_code")) %>%
  rename(new_label.x = new_label)

# Calculate the maximum x-position from the data for plot adjustments
ireland.max.x.position <- max(as.numeric(ireland.long.combined$new_code.y))

# Set an offset for the average correlation labels (to avoid overlap)
ireland.offset.x <- 1.25  # A smaller offset to avoid pushing labels too far out

# Prepare the labels for average correlation, adjusting the position to the right of the plot
ireland.avg.corr.labels <- ireland.avg.corr.labels %>%
  mutate(x_position = ireland.max.x.position + ireland.offset.x)

# Merge the average correlation data with labels to ensure proper matching
ireland.avg.corr.labels <- left_join(ireland.avg.corr.labels, labels.df, by = "code")
ireland.avg.corr.labels <- ireland.avg.corr.labels %>%
  rename(new_label.y = new_label)

# Reorder the labels according to the ordered names
ireland.ordered.names.df <- data.frame(code = ireland.ordered.names)
ireland.merged.data <- merge(ireland.ordered.names.df, labels.df, by = "code")
ireland.labeled.vector <- ireland.merged.data$new_label
ireland.ordered.names <- ireland.merged.data$new_code

# Update the factor levels of the columns based on ordered names
ireland.long.combined$new_code.x <- factor(ireland.long.combined$new_code.x, levels = ireland.ordered.names)
ireland.long.combined$new_label.y <- factor(ireland.long.combined$new_label.y, levels = ireland.labeled.vector)

# Filter and prepare data for the citizens matrix (Matrix == "(a) Citizens")
ireland.avg.corr.labels_citizens <- ireland.avg.corr.labels[ireland.avg.corr.labels$Matrix == "(a) Citizens", ]

# Order the rows in the citizens matrix by average correlation values
ireland.avg.corr.labels_citizens$new_code.x <- factor(ireland.avg.corr.labels_citizens$new_code.x, 
                                                         levels = ireland.avg.corr.labels_citizens$new_code.x[order(ireland.avg.corr.labels_citizens$avg_corr)])
ireland.new.order <- levels(ireland.avg.corr.labels_citizens$new_code.x)

# Reorder the factors based on the new order for the plot
ireland.ordered.names.df <- data.frame(new_code = ireland.new.order)
ireland.merged.data <- left_join(ireland.ordered.names.df, labels.df, by = "new_code")
ireland.labeled.vector <- ireland.merged.data$new_label
ireland.ordered.names <- ireland.ordered.names.df$new_code

# Update the levels of the factor for plotting
ireland.long.combined$new_code.x <- factor(ireland.long.combined$new_code.x, levels = ireland.ordered.names)
ireland.long.combined$new_label.y <- factor(ireland.long.combined$new_label.y, levels = ireland.labeled.vector)

# Final plot creation using ggplot2
png("figs/ireland_combined_g.png", units="in", width=17, height=9, res=1200)

# Create the ggplot
ggplot(ireland.long.combined, aes(new_code.x, new_label.y, fill = value)) +  # Define the basic aesthetics (x and y variables, fill color)
  
  # Add geom_tile() for creating heatmap-like tiles
  geom_tile() +
  
  # Set the color scale with gradient from white to black, and adjust limits and breaks for the scale
  scale_fill_gradient(
    low = "white",  # Color for low values
    high = "black",  # Color for high values
    limits = c(0, max(abs(ireland.long.combined$value))),  # Set the limits of the color scale to cover all values
    breaks = seq(0, max(abs(ireland.long.combined$value)), by = 0.2)  # Specify breaks for color scale
  ) + 
  
  # Use a minimal theme for a clean background
  theme_minimal() +
  
  # Facet the plot into multiple panels based on the "Matrix" variable
  facet_wrap(~ Matrix) + 
  
  # Rotate the x-axis labels for better visibility (90-degree rotation)
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  
  # Add the plot title and caption with dynamic text
  labs(title = "Comparison of Two Correlation Matrices",
       x = "Variables", y = "Variables",
       caption = paste("Number of online sessions:", comma(n.votesmart.ireland.df),
                       "\nMean absolute correlation coefficient: (a)", sprintf("%.2f", ireland.mean.abs.corr), 
                       "(b)", sprintf("%.2f", ireland.mean.abs.corr.p),
                       "\nPercentage of correlations ≥ 0.2: (a)", sprintf("%.2f%%", ireland.percentage.above.0.2), 
                       "(b)", sprintf("%.2f%%", ireland.percentage.above.0.2.p),
                       "\nPercentage of correlations ≥ 0.4: (a)", sprintf("%.2f%%", ireland.percentage.above.0.4), 
                       "(b)", sprintf("%.2f%%", ireland.percentage.above.0.4.p), 
                       "\nCorrelation of correlations:", ireland.corr.val)) +
  
  # Customize text sizes, remove titles, and format the plot to improve readability
  theme(
    axis.text.x = element_text(size = 10.5, angle = 90, vjust = 0.5),  # Rotate x-axis labels and adjust size
    axis.text.y = element_text(size = 10.5),  # Adjust size of y-axis labels
    axis.title.x = element_blank(),  # Remove x-axis title
    axis.title.y = element_blank(),  # Remove y-axis title
    legend.title = element_blank(),  # Remove legend title
    plot.title = element_blank(),    # Remove plot title
    panel.grid.major = element_blank(),  # Remove major grid lines
    panel.grid.minor = element_blank(),  # Remove minor grid lines
    plot.caption = element_text(size = 12),  # Adjust caption size
    strip.text = element_text(size = 16),  # Adjust size of facet labels
    panel.spacing.x = unit(.3, "cm"),  # Set space between panels
    legend.position = "bottom",  # Position the legend at the bottom
    legend.direction = "horizontal",  # Make the legend horizontal
    legend.key.height = unit(0.4, "cm"),  # Adjust the height of the legend keys
    legend.key.width = unit(2, "cm"),  # Adjust the width of the legend keys
    legend.text = element_text(size = 11),  # Adjust legend text size
    plot.margin = margin(0, 0, 0, 0, "cm")  # Set plot margins to zero for a tighter fit
  ) +
  
  # Customize x-axis labels using ggtext for HTML formatting to add color
  scale_x_discrete(labels = function(x) {
    x <- gsub("^ECONOMY(?=\\.)", "<span style='color:#800000;'><b>ECONOMY</b></span>", x, perl = TRUE)  # Add color for ECFL
    x <- gsub("^EDUCATION(?=\\.)", "<span style='color:#9A6324;'><b>EDUCATION</b></span>", x, perl = TRUE)  # Add color for EDUC
    x <- gsub("^ENVIRONMENT(?=\\.)", "<span style='color:#3cb44b;'><b>ENVIRONMENT</b></span>", x, perl = TRUE)  # Add color for ENEG
    x <- gsub("^ETHICS(?=\\.)", "<span style='color:#f58231;'><b>ETHICS</b></span>", x, perl = TRUE)  # Add color for ETHE
    x <- gsub("^FOREIGN(?=\\.)", "<span style='color:#ffe119;'><b>FOREIGN</b></span>", x, perl = TRUE)  # Add color for FAED
    x <- gsub("^WELFARE(?=\\.)", "<span style='color:#bfef45;'><b>WELFARE</b></span>", x, perl = TRUE)  # Add color for HSWF
    x <- gsub("^IMMIGRATION(?=\\.)", "<span style='color:#e6194B;'><b>IMMIGRATION</b></span>", x, perl = TRUE)  # Add color for IMIN
    x <- gsub("^CRIME(?=\\.)", "<span style='color:#42d4f4;'><b>CRIME</b></span>", x, perl = TRUE)  # Add color for JLEF
    x <- gsub("^CULTURE(?=\\.)", "<span style='color:#f032e6;'><b>CULTURE</b></span>", x, perl = TRUE)  # Add color for MECU
    x <- gsub("^MOBILITY(?=\\.)", "<span style='color:#4363d8;'><b>MOBILITY</b></span>", x, perl = TRUE)  # Add color for MPTR
    x <- gsub("^HOUSING(?=\\.)", "<span style='color:#000075;'><b>HOUSING</b></span>", x, perl = TRUE)  # Add color for SPHO
    x <- gsub("^STATE(?=\\.)", "<span style='color:#911eb4;'><b>STATE</b></span>", x, perl = TRUE)  # Add color for SRPI
    return(x)
  }) +
  
  # Similarly, customize the y-axis labels with color formatting
  scale_y_discrete(labels = function(y) {
    y <- gsub("ECONOMY(?=\\.)", "<span style='color:#800000;'><b>ECONOMY</b></span>", y, perl = TRUE)  # Add color for ECFL
    y <- gsub("EDUCATION(?=\\.)", "<span style='color:#9A6324;'><b>EDUCATION</b></span>", y, perl = TRUE)  # Add color for EDUC
    y <- gsub("ENVIRONMENT(?=\\.)", "<span style='color:#3cb44b;'><b>ENVIRONMENT</b></span>", y, perl = TRUE)  # Add color for ENEG
    y <- gsub("ETHICS(?=\\.)", "<span style='color:#f58231;'><b>ETHICS</b></span>", y, perl = TRUE)  # Add color for ETHE
    y <- gsub("FOREIGN(?=\\.)", "<span style='color:#ffe119;'><b>FOREIGN</b></span>", y, perl = TRUE)  # Add color for FAED
    y <- gsub("WELFARE(?=\\.)", "<span style='color:#bfef45;'><b>WELFARE</b></span>", y, perl = TRUE)  # Add color for HSWF
    y <- gsub("IMMIGRATION(?=\\.)", "<span style='color:#e6194B;'><b>IMMIGRATION</b></span>", y, perl = TRUE)  # Add color for IMIN
    y <- gsub("CRIME(?=\\.)", "<span style='color:#42d4f4;'><b>CRIME</b></span>", y, perl = TRUE)  # Add color for JLEF
    y <- gsub("CULTURE(?=\\.)", "<span style='color:#f032e6;'><b>CULTURE</b></span>", y, perl = TRUE)  # Add color for MECU
    y <- gsub("MOBILITY(?=\\.)", "<span style='color:#4363d8;'><b>MOBILITY</b></span>", y, perl = TRUE)  # Add color for MPTR
    y <- gsub("HOUSING(?=\\.)", "<span style='color:#000075;'><b>HOUSING</b></span>", y, perl = TRUE)  # Add color for SPHO
    y <- gsub("STATE(?=\\.)", "<span style='color:#911eb4;'><b>STATE</b></span>", y, perl = TRUE)  # Add color for SRPI
    return(y)
  }) +
  
  # Enable Markdown formatting for axis labels (to support HTML formatting)
  theme(axis.text.x = element_markdown(),
        axis.text.y = element_markdown()) +  
  
  # Add average absolute correlation labels at the end of each row, only for certain matrices
  geom_text(data = ireland.avg.corr.labels %>%
              filter(Matrix == "(a) Citizens" | Matrix == "(b) Party leaders"),  # Filter data for the labels
            aes(x = x_position, y = new_label.y, label = sprintf("%.2f", avg_corr)),
            inherit.aes = FALSE, size = 4, hjust = 0, vjust = 0.5) +  # Adjust label positioning
  
  # Expand x-axis to accommodate labels just outside the plot area
  coord_cartesian(xlim = c(NA, ireland.max.x.position + ireland.offset.x + 2))  # Set x-axis limit

# Save the plot to a file
dev.off()

################################################################################
## CONFIRMATORY FACTOR ANALYSIS
################################################################################

# Ensure that `labels_df$code` contains only the columns that exist in `votesmart.ireland.df`
# This step filters out any codes in `labels_df` that do not have corresponding columns in the data frame
labels_df <- labels.df %>% filter(code %in% colnames(votesmart.ireland.df.wgt))

# Create a named vector `rename_map` that maps each old column name to a new name
# The new names combine the old code and corresponding topic (e.g., "code_topic")
rename_map <- setNames(paste0(labels_df$code, "_", labels_df$topic), labels_df$code)

# Rename the columns of `votesmart.ireland.df` according to `rename_map`
# This ensures that column names are more informative by combining the code and the topic
votesmart.ireland.df.cfa <- votesmart.ireland.df.wgt %>%
  rename_with(~ rename_map[.x], .cols = all_of(labels_df$code))

# Remove NA values
votesmart.ireland.df.cfa <- na.omit(votesmart.ireland.df.cfa)

## SPECIFY MODELS
################################################################################

# (A) One-factor model: All items load onto a single "ideology" factor

# Extract the column names from the data frame (excluding "_Other" items)
votesmart.ireland.items <- colnames(votesmart.ireland.df.cfa)
votesmart.ireland.items <- votesmart.ireland.items[!grepl("_Other", votesmart.ireland.items) & 
                                                         votesmart.ireland.items != "person_weights"]

# Define the CFA model for the one-factor model: all variables are indicators of a single latent factor "ideology"
votesmart.ireland.cfa.model.1f <- paste0("ideology =~ ", paste(votesmart.ireland.items, collapse = " + "))

# (B) Two-factor model: Items are split into two factors: Economic vs Socio-cultural

# Identify variables related to economic and socio-cultural factors based on their suffix
votesmart.ireland.economic.vars <- grep("_Economic$", names(votesmart.ireland.df.cfa), value = TRUE)
votesmart.ireland.cultural.vars <- grep("_Cultural$", names(votesmart.ireland.df.cfa), value = TRUE)

# Remove the suffix "_Economic" from the economic variable names for better readability
votesmart.ireland.economic.vars <- gsub("_Economic$", "", unlist(votesmart.ireland.economic.vars))

# Remove the suffix "_Cultural" from the cultural variable names for better readability
votesmart.ireland.cultural.vars <- gsub("_Cultural$", "", unlist(votesmart.ireland.cultural.vars))

# Define the CFA model for the two-factor model: 
# Economic items load on the "Economic" factor and Cultural items load on the "Cultural" factor
votesmart.ireland.cfa.model.2f <- paste(
  "Economic =~", paste(votesmart.ireland.economic.vars, collapse = " + "), "\n",
  "Cultural =~", paste(votesmart.ireland.cultural.vars, collapse = " + ")
)

## RUN CFA
################################################################################

# Fit the one-factor model using the WLSMV estimator (appropriate for binary data)
votesmart.ireland.cfa.fit.1f <- cfa(votesmart.ireland.cfa.model.1f, 
                                      data = votesmart.ireland.df.cfa, 
                                      sampling.weights = "person_weights",
                                      ordered = colnames(votesmart.ireland.df.cfa), 
                                      estimator = "WLSMV")

# Clean up the column names of the data by removing everything after the first "_"
cols_to_rename <- setdiff(colnames(votesmart.ireland.df.cfa), "person_weights")
new_names <- sapply(strsplit(cols_to_rename, "_"), `[`, 1)
colnames(votesmart.ireland.df.cfa)[colnames(votesmart.ireland.df.cfa) != "person_weights"] <- new_names

# Fit the two-factor model (no estimator specified here as it's already default for binary data)
votesmart.ireland.cfa.fit.2f <- cfa(votesmart.ireland.cfa.model.2f, 
                                      data = votesmart.ireland.df.cfa, 
                                      sampling.weights = "person_weights",
                                      ordered = colnames(votesmart.ireland.df.cfa))

## DIAGNOSTICS AND MODEL COMPARISON
################################################################################

# Inspect standardized factor loadings to check if they are sufficiently strong (typically > 0.5)
parameterEstimates(votesmart.ireland.cfa.fit.1f, standardized = TRUE) %>%
  filter(op == "=~") # Extract factor loadings for the one-factor model
parameterEstimates(votesmart.ireland.cfa.fit.2f, standardized = TRUE) %>%
  filter(op == "=~") # Extract factor loadings for the two-factor model

# Check overall model fit indices for both models (values close to 1 indicate good fit)
summary(votesmart.ireland.cfa.fit.1f, fit.measures = TRUE, standardized = TRUE)
summary(votesmart.ireland.cfa.fit.2f, fit.measures = TRUE, standardized = TRUE)

# Extract fit statistics
votesmart.ireland.cfa.fit.stats.f1 <- fitMeasures(votesmart.ireland.cfa.fit.1f, c("chisq", "df", "cfi", "tli", "rmsea", "srmr"))
votesmart.ireland.cfa.fit.stats.f2 <- fitMeasures(votesmart.ireland.cfa.fit.2f, c("chisq", "df", "cfi", "tli", "rmsea", "srmr"))

# Create a data frame with selected fit statistics
fit_table <- data.frame(
  Model = c("One-Factor", "Two-Factor"),
  RMSEA = c(votesmart.ireland.cfa.fit.stats.f1["rmsea"], votesmart.ireland.cfa.fit.stats.f2["rmsea"]),
  CFI   = c(votesmart.ireland.cfa.fit.stats.f1["cfi"], votesmart.ireland.cfa.fit.stats.f2["cfi"]),
  TLI   = c(votesmart.ireland.cfa.fit.stats.f1["tli"], votesmart.ireland.cfa.fit.stats.f2["tli"]),
  SRMR  = c(votesmart.ireland.cfa.fit.stats.f1["srmr"], votesmart.ireland.cfa.fit.stats.f2["srmr"])
)

# Round values for readability
fit_table[, 2:5] <- round(fit_table[, 2:5], 3)

# Run chi-square difference test
lrt_result <- lavTestLRT(votesmart.ireland.cfa.fit.1f, votesmart.ireland.cfa.fit.2f)
lrt_table <- as.data.frame(lrt_result)

# Generate LaTeX table for fit indices
fit_xtable <- xtable(fit_table, caption = "Fit Statistics for One-Factor and Two-Factor Models")
print(fit_xtable, include.rownames = FALSE, caption.placement = "top", booktabs = TRUE)

# Generate LaTeX table for chi-square difference test
lrt_xtable <- xtable(lrt_table, caption = "Chi-Square Difference Test Between Models")
print(lrt_xtable, include.rownames = FALSE, caption.placement = "top", booktabs = TRUE)

## INTERPRETATION
################################################################################

# Retrieve standardized factor loadings for the two-factor model
inspect(votesmart.ireland.cfa.fit.2f, what = "std")$lambda

# Check the internal consistency (reliability) of the economic and cultural factors using Cronbach's alpha
psych::alpha(votesmart.ireland.df.cfa[, votesmart.ireland.economic.vars])  # Reliability for Economic Factor
psych::alpha(votesmart.ireland.df.cfa[, votesmart.ireland.cultural.vars])  # Reliability for Cultural Factor

# Save the SEM plot of the two-factor model to a png file
png("figs/ireland_cfa_sem.png", units="in", width=25, height=15, res=300)

# Plot the two-factor model with standardized factor loadings displayed on the paths
semPaths(
  votesmart.ireland.cfa.fit.2f,
  what = "path", 
  whatLabels = "std",        # Display standardized estimates on the paths
  style = "ram",             # Use RAM-style diagram for better clarity
  layout = "spring",         # Force-directed layout for better separation of nodes
  fixedStyle = "red",        # Color fixed paths in red
  freeStyle = "black",       # Color free paths in black
  edge.width = 0.4,          # Set edge width for better visibility
  border.width = 0.2,        # Set border width of nodes
  border.color = "black",    # Set border color of nodes
  asize = 1,                 # Set arrow size for paths
  edge.label.bg = TRUE,      # Add background to edge labels for readability
  edge.label.cex = 0.5,      # Increase edge label size for readability
  edge.label.position = 0.7, # Position edge labels at the center of edges
  sizeMan = 4,               # Set the size of observed variable boxes
  sizeLat = 5,               # Set the size of latent variable ovals for emphasis
  color = list(lat = "lightgrey", man = "transparent"),  # Color scheme for variables
  title = FALSE,             # Remove default title from the plot
  intercepts = FALSE,        # Exclude intercepts from the plot
  thresholds = FALSE         # Exclude thresholds from the plot
)

# Add model fit statistics as text annotations on the plot
x_pos <- 0.90
y_pos <- 0.95

text(x = x_pos, y = y_pos, labels = paste("RMSEA:", round(votesmart.ireland.cfa.rmsea, 3), 
                                          "\nCFI:", round(votesmart.ireland.cfa.cfi, 3), 
                                          "\nTLI:", round(votesmart.ireland.cfa.tli, 3)),
     cex = 1.5, col = "black")

dev.off()

# Extract factor loadings from the two-factor model and clean up the 'rhs' variable for clarity
votesmart.ireland.cfa.f2.loadings <- parameterEstimates(votesmart.ireland.cfa.fit.2f, standardized = TRUE) %>%
  filter(op == "=~")

# Modify the 'rhs' column to keep only the part before the "_"
votesmart.ireland.cfa.f2.loadings$rhs <- sub("_.*", "", votesmart.ireland.cfa.f2.loadings$rhs)

# Merge the factor loadings with the corresponding labels from `labels_df`
votesmart.ireland.cfa.f2.loadings <- merge(votesmart.ireland.cfa.f2.loadings, labels_df, by.x = "rhs", by.y = "code")

# Reorder the 'label' variable within each latent factor based on the absolute value of the factor loading
votesmart.ireland.cfa.f2.loadings <- votesmart.ireland.cfa.f2.loadings %>%
  group_by(lhs) %>%
  mutate(label = factor(label, levels = label[order(abs(est))])) %>%
  ungroup()

# Save the factor loadings plot as a PNG file
png("figs/ireland_cfa_f2_loadings.png", units="in", width=7, height=6, res=1200)

# Create a bar plot of the factor loadings for each variable within each factor
ggplot(votesmart.ireland.cfa.f2.loadings, aes(x = est, y = label, fill = lhs)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = c("#ff7f0e", "#1f77b4")) +  # Customize bar colors
  #scale_x_continuous(limits = c(-4, 4), breaks = seq(-4, 4, by = 2)) +  # Set x-axis range
  labs(
    x = "Loading strength",  # x-axis label for the plot
    y = NULL,  # Remove y-axis label
    fill = "Latent variable"  # Legend title
  ) +
  theme_minimal() +
  theme(
    axis.title.x = element_text(size = 10, margin = margin(t = 8)),  # Customize x-axis title appearance
    legend.title = element_text(size = 9),
    legend.position = "bottom",  # Place legend at the bottom
    legend.key.width = unit(0.5, "cm"),  # Adjust legend key width
    legend.key.height = unit(0.5, "cm")  # Adjust legend key height
  )

dev.off()

################################################################################
## EXPLORATORY FACTOR ANALYSIS
################################################################################

ireland.tetrachoric.matrix <- tetrachoric(votesmart.ireland.df)$rho
#ireland.tetrachoric.matrix <- hetcor(votesmart.ireland.df)$cor

# Kaiser-Meyer-Olkin criterion: tests the suitability of the data for factor analysis
KMO(ireland.tetrachoric.matrix)

# Bartlett's test of sphericity: tests whether the correlation matrix is an identity matrix, indicating if factor analysis is appropriate
BARTLETT(votesmart.ireland.df)

# Eigenvalue method (“Kaiser’s rule”): calculates eigenvalues to determine the number of factors to retain
ireland.ev <- eigen(ireland.tetrachoric.matrix) # get eigenvalues
#ireland.ev <- eigen(cor(votesmart.ireland.df, use = 'pairwise.complete.obs')) # get eigenvalues
ireland.ev$values  # returns the eigenvalues for each factor

# Scree plot: visualizes the eigenvalues to help determine the number of factors to retain
scree(ireland.tetrachoric.matrix, pc=FALSE)  # Use pc=FALSE for factor analysis, without Principal Components

# Parallel analysis scree plot: compares observed eigenvalues with simulated eigenvalues to decide on the number of factors
set.seed(123)  # Set random seed for reproducibility
ireland.parallel <- fa.parallel(ireland.tetrachoric.matrix, fa='fa')  # Parallel analysis
ireland.parallel  # Output the results of parallel analysis
ireland.parallel$nfact  # Suggested number of factors

# Create data frame from observed eigenvalue data for plotting
ireland.obs <- data.frame(ireland.parallel$fa.values)
ireland.obs$type <- c('Observed Data')  # Label the data as observed
ireland.obs$num <- c(row.names(ireland.obs))  # Add factor number as column
ireland.obs$num <- as.numeric(ireland.obs$num)  # Convert factor number to numeric
colnames(ireland.obs) <- c('eigenvalue', 'type', 'num')  # Set column names

# Calculate quantiles for eigenvalues from simulated data, storing the 95th percentile values
percentile <- apply(ireland.parallel$values,2,function(x) quantile(x,.95))
ireland.min <- as.numeric(nrow(ireland.obs))  # Minimum value for selecting the range
ireland.min <- (4*ireland.min) - (ireland.min-1)  # Adjust minimum index
ireland.max <- as.numeric(nrow(ireland.obs))  # Maximum value for selecting the range
ireland.max <- 4*ireland.max  # Adjust maximum index
ireland.percentile1 <- percentile[ireland.min:ireland.max]  # Select the 95th percentile eigenvalues

# Create a data frame for the simulated eigenvalue data
ireland.sim <- data.frame(ireland.percentile1)
ireland.sim$type <- c('Simulated Data (95th %ile)')  # Label the data as simulated
ireland.sim$num <- c(row.names(ireland.obs))  # Add factor number as column
ireland.sim$num <- as.numeric(ireland.sim$num)  # Convert factor number to numeric
colnames(ireland.sim) <- c('eigenvalue', 'type', 'num')  # Set column names

# Merge the observed and simulated eigenvalue data frames
ireland.eigendat <- rbind(ireland.obs,ireland.sim)

# Define a custom theme for the plot
apatheme = theme_bw() + 
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.background = element_blank(),
        panel.border = element_blank(),
        text = element_text(family = 'Arial'),
        legend.title = element_blank(),
        legend.position = c(.7, .8),
        axis.line.x = element_line(color = 'black'),
        axis.line.y = element_line(color = 'black'))

# Save the scree plot as a png file
png("figs/ireland_scree_g.png", units="in", width=8, height=6, res=1200)

# Create the scree plot using ggplot2
ireland.scree <- ggplot(ireland.eigendat, aes(x = num, y = eigenvalue, shape = type)) +
  geom_line() +  # Add lines connecting data points
  geom_point(size = 2.5) +  # Add points for each eigenvalue
  scale_y_continuous(name = 'Eigenvalue', limits = c(-.6, 5), breaks = seq(0, 5, 1)) +  # Y-axis labels and limits
  scale_x_continuous(name = 'Factor Number', breaks = min(ireland.eigendat$num):max(ireland.eigendat$num)) +  # X-axis labels and limits
  scale_shape_manual(values = c(16, 1)) +  # Define different shapes for observed and simulated data
  geom_vline(xintercept = ireland.parallel$nfact, linetype = 'dashed') +  # Add vertical line for suggested number of factors
  theme_bw() +  # Apply custom theme
  theme(axis.title.x = element_text(size = 12, margin = margin(t = 18, r = 0, b = 0, l = 0)),
        axis.title.y = element_text(size = 12, margin = margin(t = 0, r = 18, b = 0, l = 0)),
        axis.text.x = element_text(size = 12, color = "black", margin = margin(t = 8, r = 0, b = 0, l = 0)),
        axis.text.y = element_text(size = 12, color = "black", margin = margin(t = 0, r = 8, b = 0, l = 0)),
        legend.position = c(.82, .92),
        legend.title = element_blank(),
        legend.text = element_text(size = 10),
        legend.background = element_blank(),
        legend.key = element_rect(colour = NA, fill = NA),
        axis.ticks.length = unit(.18, "cm"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

# Call the plot to render it
ireland.scree

# Close the file device to save the plot
dev.off()

# Very Simple Structure (VSS) criterion: assesses the simplicity of the factor structure
ireland.vss <- VSS(ireland.tetrachoric.matrix)   

# Perform VSS with oblimin rotation
ireland.vss.r <- VSS(ireland.tetrachoric.matrix, rotate = "oblimin")
ireland.vss.r

# Create a scree plot for VSS
VSS.scree(ireland.tetrachoric.matrix, main = "scree plot")
#VSS.scree(cor(votesmart.ireland.df, use = 'pairwise.complete.obs'), main = "scree plot")

# Save the VSS plot as a png file
png("figs/ireland_vss_g.png", units = "in", width = 8, height = 6, res = 300)

# Run and display the results of VSS
ireland.vss <- VSS(ireland.tetrachoric.matrix)   
ireland.vss

# Close the file device for VSS plot
dev.off()

# Velicer's minimum average partial (MAP) test: helps determine the number of factors to retain by minimizing residual variance
MAP(votesmart.ireland.df, corkind = 'polychoric', verbose = TRUE)

# Testing different factor models with up to 4 factors
#ireland.nf <- nfactors(votesmart.ireland.df, n = 4, fm = "ml", cor = "tet")
#ireland.nf 

# Extract and rotate factors (4 factors in this case)
ireland.Nfacs <- 4

# Perform factor analysis using oblimin/varimax rotation
#cor_fa <- cor(votesmart.ireland.df, use = 'pairwise.complete.obs')
ireland.fit.obl <- factanal(covmat = ireland.tetrachoric.matrix, factors = ireland.Nfacs, rotation = "oblimin") # oblimin rotation 
ireland.fit.var <- factanal(covmat = ireland.tetrachoric.matrix, factors = ireland.Nfacs, rotation = "varimax") # varimax rotation 

# Print the factor analysis results, showing loadings greater than 0.3
print(ireland.fit.obl, digits = 5, cutoff = 0.3, sort = TRUE)

# Extract the factor loadings
ireland.efa.loads <- ireland.fit.obl$loadings

# Perform factor analysis using oblimin/varimax rotation again using fa
ireland.fit.obl.fa <- psych::fa(ireland.tetrachoric.matrix, nfactors = ireland.Nfacs, rotate = "oblimin") # oblimin rotation 

# Extract the factor loadings
ireland.efa.loads.fa <- ireland.fit.obl.fa$loadings

# Calculate the proportion of variance explained by each factor
# Sum of squared loadings for each factor (this represents the explained variance)
ireland.efa.ss <- apply(ireland.efa.loads.fa^2, 2, sum)

# The total variance is 1 for standardized data, so the proportion explained by each factor
ireland.efa.var <- ireland.efa.ss/length(ireland.fit.obl.fa$communality)

# Create a dataframe with the proportion variance for each factor
ireland.efa.var.df <- data.frame(Factor = paste0("F", 1:ireland.Nfacs), 
                                    Proportion_Variance = ireland.efa.var)

# Write 'dataframe' dataframe to Excel
write_xlsx(ireland.efa.var.df, "files/ireland_efa_var_df.xlsx")

# Extract the inter-factor correlation matrix (Phi)
ireland.efa.inter <- ireland.fit.obl.fa$Phi

# Create a dataframe from the inter-factor correlation matrix
ireland.efa.inter.df <- data.frame(ireland.efa.inter)

# Move row names (which represent factors) into a new column called 'Factor'
ireland.efa.inter.df$Factor <- rownames(ireland.efa.inter.df)

# Replace "MR" with "F" in the Factor column (e.g., MR1 → F1)
ireland.efa.inter.df$Factor <- gsub("MR", "F", ireland.efa.inter.df$Factor)

# Also clean column names by removing ".MR" to keep only factor names (e.g., F.MR1 → F1)
colnames(ireland.efa.inter.df) <- gsub("\\.MR", "", colnames(ireland.efa.inter.df))

# Reshape the dataframe from wide to long format for pairwise correlations
ireland.efa.inter.df.long <- ireland.efa.inter.df %>%
  # Pivot all columns except 'Factor' into long format
  pivot_longer(cols = -Factor, names_to = "Other_Factor", values_to = "Correlation") %>%
  # Remove rows where a factor is correlated with itself (diagonal)
  filter(Factor != Other_Factor) %>%
  # Create a unique identifier for each pair (sorted alphabetically to avoid duplicates)
  mutate(pair = pmap_chr(list(Factor, Other_Factor), ~ paste(sort(c(..1, ..2)), collapse = "_"))) %>%
  # Keep only one row per unique factor pair
  distinct(pair, .keep_all = TRUE) %>%
  # Rename columns for clarity
  select(Factor1 = Factor, Factor2 = Other_Factor, Correlation)

# Ensure 'Factor2' labels are also cleaned (if still in MR format, though this may be redundant)
ireland.efa.inter.df.long$Factor2 <- gsub("MR", "F", ireland.efa.inter.df.long$Factor2)

# Remove rows where Factor1 and Factor2 are the same (i.e., self-correlations)
ireland.efa.inter.df.long <- ireland.efa.inter.df.long %>%
  filter(Factor1 != Factor2)

# Write 'ireland.efa.inter.df.long' dataframe to Excel
write_xlsx(ireland.efa.inter.df.long, "files/ireland_efa_inter_correlations.xlsx")

# Create a diagram of the factor loadings
fa.diagram(ireland.efa.loads)

# Transform factor loadings into a dataframe for easier analysis and manipulation
ireland.efa.loads.df <- as.data.frame.matrix(ireland.efa.loads)

# Transform row names (factor names) into a column
ireland.efa.loads.df <- rownames_to_column(ireland.efa.loads.df, var = "statements")

# Set values with absolute loadings <= 0.3 to NA for clarity
ireland.efa.loads.df[sapply(ireland.efa.loads.df, is.numeric)] <- lapply(ireland.efa.loads.df[sapply(ireland.efa.loads.df, is.numeric)], function(x) {
  x[abs(x) <= 0.3] <- NA
  return(x)
})

# Merge the factor loading data frame with another data frame containing labels for the statements
# This merges 'ireland.efa.loads.df' with 'labels.df' on the 'statements' column in 'ireland.efa.loads.df'
# and 'code' column in 'labels.df' to associate labels with factor loadings.
ireland.efa.loads.df <- merge(ireland.efa.loads.df, labels.df, by.x = "statements", by.y = "code")

# Select and organize necessary columns for further analysis and visualization
# Here we keep only the 'label' column and the four factors (Factor1 to Factor4) from the merged dataframe.
ireland.efa.loads.df <- ireland.efa.loads.df %>%
  select(label, Factor1, Factor2, Factor3, Factor4)

# Create a LaTeX table of the factor loadings for reporting purposes
# This converts 'ireland.efa.loads.df' into a LaTeX-compatible table with a caption and label for referencing in a report.
ireland.efa.loads.table <- xtable(ireland.efa.loads.df, caption = "Factor Loadings", label = "tab1")

# Save the LaTeX table to a .tex file
sink("files/ireland_loads_table.tex")  # This redirects output to the file
print(ireland.efa.loads.table, type = "latex", comment = FALSE, include.rownames = FALSE)  # Prints the table in LaTeX format
sink()  # Turns off the sink function, so the output returns to the console

# Add a new column 'ids' that contains the row names of the data frame (useful for identifying rows)
ireland.efa.loads.df$ids <- rownames(ireland.efa.loads.df)

# Select relevant columns, including 'ids', 'label', and the factor columns, into a new data frame
ireland.factor.loads <- ireland.efa.loads.df[, c("ids", "label", "Factor1", "Factor2", "Factor3", "Factor4")]

# Melt the data from wide format to long format, which is necessary for ggplot to visualize the data
# This converts the factor loading data from multiple columns into a single 'variable' column and a 'value' column.
ireland.efa.loads.df <- reshape2::melt(ireland.factor.loads)

# Replace the string "Factor" with "F" in the 'variable' column for cleaner labeling in the plot
ireland.efa.loads.df$variable <- gsub("Factor", "F", ireland.efa.loads.df$variable)

ireland.efa.loads.df$ids <- as.factor(ireland.efa.loads.df$ids)
ireland.efa.loads.df$label <- as.factor(ireland.efa.loads.df$label)

# Merge ireland.efa.loads.df with labels.df
ireland.efa.loads.df <- merge(ireland.efa.loads.df, labels.df, by = "label")

# Add ideological leaning to statement labels
ireland.efa.loads.df <- ireland.efa.loads.df %>%
  mutate(
    new_label = paste0(ideology, " ", new_label)  
  )

# Create a png file to save the heatmap

# The resolution is set to 1200 dpi for high-quality output, and the image size is specified in inches (6x6).
png("figs/ireland_efa_loadings.png", units="in", width=7, height=6, res=1200)

# Sort the levels of the 'new_label' based on the substring starting from the 5th character
# Ensure there are no duplicate levels by using unique() before assigning the levels
ireland.efa.loads.df$new_label <- factor(ireland.efa.loads.df$new_label, 
                                            levels = unique(ireland.efa.loads.df$new_label[order(substr(ireland.efa.loads.df$new_label, 5, nchar(ireland.efa.loads.df$new_label)))]))

# Create the heatmap using ggplot2
# The 'x' axis represents the factors, 'y' represents the ids (statements),
# and 'fill' represents the factor loading values.
ggplot(ireland.efa.loads.df, aes(x = variable, y = new_label, fill = value)) + 
  geom_tile() +  # Create a tile-based heatmap
  geom_text(aes(label = ifelse(is.na(value), "", sprintf("%.2f", value))), size=2.5) + 
  scale_fill_gradient2(midpoint = 0, low = "#FFE338", high = "#FF0000", mid = "white", 
                       na.value = "lightgrey") +  # Color scale from yellow to red, with NA values in light grey
  labs(fill = "Loading") +  # Label for the color scale legend
  theme_minimal() +  # Apply a minimal theme for the plot
  scale_y_discrete(labels = function(y) {
    # Add color formatting
    y <- gsub("ECONOMY(?=\\.)", "<span style='color:#800000;'><b>ECONOMY</b></span>", y, perl = TRUE)  # Add color for ECFL
    y <- gsub("EDUCATION(?=\\.)", "<span style='color:#9A6324;'><b>EDUCATION</b></span>", y, perl = TRUE)  # Add color for EDUC
    y <- gsub("ENVIRONMENT(?=\\.)", "<span style='color:#3cb44b;'><b>ENVIRONMENT</b></span>", y, perl = TRUE)  # Add color for ENEG
    y <- gsub("ETHICS(?=\\.)", "<span style='color:#f58231;'><b>ETHICS</b></span>", y, perl = TRUE)  # Add color for ETHE
    y <- gsub("FOREIGN(?=\\.)", "<span style='color:#ffe119;'><b>FOREIGN</b></span>", y, perl = TRUE)  # Add color for FAED
    y <- gsub("WELFARE(?=\\.)", "<span style='color:#bfef45;'><b>WELFARE</b></span>", y, perl = TRUE)  # Add color for HSWF
    y <- gsub("IMMIGRATION(?=\\.)", "<span style='color:#e6194B;'><b>IMMIGRATION</b></span>", y, perl = TRUE)  # Add color for IMIN
    y <- gsub("CRIME(?=\\.)", "<span style='color:#42d4f4;'><b>CRIME</b></span>", y, perl = TRUE)  # Add color for JLEF
    y <- gsub("CULTURE(?=\\.)", "<span style='color:#f032e6;'><b>CULTURE</b></span>", y, perl = TRUE)  # Add color for MECU
    y <- gsub("MOBILITY(?=\\.)", "<span style='color:#4363d8;'><b>MOBILITY</b></span>", y, perl = TRUE)  # Add color for MPTR
    y <- gsub("HOUSING(?=\\.)", "<span style='color:#000075;'><b>HOUSING</b></span>", y, perl = TRUE)  # Add color for SPHO
    y <- gsub("STATE(?=\\.)", "<span style='color:#911eb4;'><b>STATE</b></span>", y, perl = TRUE)  # Add color for SRPI
    return(y)
  }) +
  theme(axis.text.y = element_markdown(),
        axis.text.x = element_text(hjust = .5),  # Center the x-axis labels
        axis.title = element_blank(),  # Remove axis titles
        legend.title = element_text(size = 9, margin = margin(r = 0.3, t = -1.8, unit = "lines")),  # Customize the legend title text size and margins
        legend.position = "bottom",  # Place the legend at the bottom
        legend.direction = "horizontal",  # Set the legend items to be horizontal
        legend.key.height = unit(0.4, "cm"), # Reduce the height of the legend scale
        legend.key.width = unit(.8, "cm")) # Increase the width of the legend scale

dev.off()  # Close the png file and save the plot

# Factor analysis diagram

png("figs/ireland_fa_diagram_g.png", units="in", width=15, height=12, res=1200)

ireland.fa.result <- psych::fa(ireland.tetrachoric.corr$rho, nfactors = 4, rotate = "oblimin")
colnames(ireland.fa.result$loadings) <- c("Factor 1", "Factor 2", "Factor 3", "Factor 4")
fa.diagram(ireland.fa.result, simple = FALSE, cut = 0, cex = 1, l.cex = 1, digits = 2)

dev.off()

################################################################################
## ITEM RESPONSE THEORY
################################################################################

# Remove any rows with missing data from the dataset
# This ensures that the model estimation doesn't fail due to NA values
votesmart.ireland.df.complete <- votesmart.ireland.df.wgt[complete.cases(votesmart.ireland.df.wgt), ]

# Set seed
set.seed(123)

# Randomly sample 10,000 complete cases from the cleaned dataset
# This reduces the data size for quicker model estimation
votesmart.ireland.df.sample <- votesmart.ireland.df.complete[
  sample(nrow(votesmart.ireland.df.complete), 
         size = min(10000, nrow(votesmart.ireland.df.complete))), 
]

# Fit a unidimensional 2-Parameter Logistic (2PL) IRT model to the sample
# Assumes there is one underlying latent trait (dimension)
ireland.2PL.1D <- mirt(votesmart.ireland.df.sample[, !names(votesmart.ireland.df.sample) %in% "person_weights"],
                          model = 1, itemtype = '2PL', 
                          weights = votesmart.ireland.df.sample$person_weights)

summary(ireland.2PL.1D)

# Fit a bidimensional 2PL IRT model to the same sample
# Assumes two latent traits or factors influence item responses
ireland.2PL.2D <- mirt(votesmart.ireland.df.sample[, !names(votesmart.ireland.df.sample) %in% "person_weights"],
                          model = 2, itemtype = '2PL', 
                          weights = votesmart.ireland.df.sample$person_weights)

summary(ireland.2PL.2D)

# Fit a three-dimensional 2PL IRT model to the same sample
# Assumes two latent traits or factors influence item responses
ireland.2PL.3D <- mirt(votesmart.ireland.df.sample[, !names(votesmart.ireland.df.sample) %in% "person_weights"],
                          model = 3, itemtype = '2PL', 
                          weights = votesmart.ireland.df.sample$person_weights)

summary(ireland.2PL.3D)

# Fit a four-dimensional 2PL IRT model to the same sample
# Assumes two latent traits or factors influence item responses
ireland.2PL.4D <- mirt(votesmart.ireland.df.sample[, !names(votesmart.ireland.df.sample) %in% "person_weights"],
                          model = 4, itemtype = '2PL', 
                          weights = votesmart.ireland.df.sample$person_weights)

summary(ireland.2PL.4D)

# Compare the two models (1D vs. 2D) using a likelihood ratio test
# Helps determine whether the added complexity of a 2D model significantly improves fit
anova(ireland.2PL.1D, ireland.2PL.2D)

# Compare the two models (2D vs. 3D) using a likelihood ratio test
# Helps determine whether the added complexity of a 3D model significantly improves fit
anova(ireland.2PL.2D, ireland.2PL.3D)

# Compare the two models (3D vs. 4D) using a likelihood ratio test
# Helps determine whether the added complexity of a 4D model significantly improves fit
anova(ireland.2PL.3D, ireland.2PL.4D)

# Extract the factor loadings
ireland.irt.loads <- summary(ireland.2PL.4D)$rotF

# Calculate the proportion of variance explained by each factor
# Sum of squared loadings for each factor (this represents the explained variance)
ireland.irt.ss <- apply(ireland.irt.loads^2, 2, sum)

# The total variance is 1 for standardized data, so the proportion explained by each factor
ireland.irt.var <- ireland.irt.ss/nrow(ireland.irt.loads)

# Create a dataframe with the proportion variance for each factor
ireland.irt.var.df <- data.frame(Factor = paste0("F", 1:4), 
                                    Proportion_Variance = ireland.irt.var)

# Write 'dataframe' dataframe to Excel
write_xlsx(ireland.irt.var.df, "files/ireland_irt_var_df.xlsx")

# Extract item parameters from the fitted multidimensional 2PL model
# Set IRTpars = TRUE to return discrimination (a) and difficulty (b) parameters
ireland.item.params <- coef(ireland.2PL.4D, IRTpars = TRUE, simplify = TRUE)

# Combine item-level parameter lists into a single data frame
# Remove the last element if it's the $Group (typically contains test-level info)
ireland.item.params.df <- do.call(rbind, ireland.item.params[-length(ireland.item.params)]) 
ireland.item.params.df <- as.data.frame(ireland.item.params.df)

# Add item names as a column (they're currently row names)
ireland.item.params.df$Item <- rownames(ireland.item.params.df)

# Step 1: Compute total discrimination (α)
# This is the euclidean norm of the a-vector across the four latent dimensions
# α = sqrt(a1^2 + a2^2 + a3^2 + a4^2), where a1–a4 are the item discriminations on each dimension
ireland.item.params.df$discrimination <- sqrt(ireland.item.params.df$a1^2 + ireland.item.params.df$a2^2 + ireland.item.params.df$a3^2 + ireland.item.params.df$a4^2)

# Step 2: Compute difficulty (b) for each item
# In multidimensional 2PL, difficulty is derived from the intercept (d) as:
# b = -d / α
# This gives a sense of how "hard" it is to agree with the item (i.e., threshold location)
ireland.item.params.df$b_calculated <- -ireland.item.params.df$d / ireland.item.params.df$discrimination

# Step 3: Clean the data
# Exclude any items where discrimination is missing or zero
# (This avoids divide-by-zero issues or meaningless points in the plot)
ireland.item.params.df.clean <- ireland.item.params.df[!is.na(ireland.item.params.df$discrimination) & ireland.item.params.df$discrimination > 0, ]

# Step 4: Plot item difficulty vs. discrimination
# Each point is an item; labels show the item name
# Higher discrimination = more informative item
# Higher difficulty = item requires higher latent trait level to agree with

png("figs/ireland_mirt_parameters.png", units = "in", width = 8, height = 6, res = 300)

ggplot(ireland.item.params.df.clean, aes(x = b_calculated, y = discrimination, label = rownames(ireland.item.params.df.clean))) +
  geom_point(color = "#0072B2", size = 3) +
  geom_text(vjust = -0.6, size = 4) +
  theme_minimal() +
  labs(
    # Removed the title
    x = "Difficulty (b)",
    y = "Discrimination (α)"
    #caption = "Computed from multidimensional 2PL IRT model"
  ) +
  theme(
    axis.title.x = element_text(size = 16, margin = margin(t = 12)),  # Top margin for x-axis title
    axis.title.y = element_text(size = 16, margin = margin(r = 12)),  # Right margin for y-axis title
    axis.text = element_text(size = 14),          # Larger axis tick labels
    plot.caption = element_text(size = 12),       # Optional: caption size
    panel.grid.minor = element_blank()
  )

dev.off()

# Store IRT model summary in an object
ireland.2PL.4D.sum <- summary(ireland.2PL.4D)

# Extract the factor correlation matrix
ireland.2PL.4D.corr <- ireland.2PL.4D.sum$fcor

# Convert to dataframe
ireland.2PL.4D.corr.df <- as.data.frame(ireland.2PL.4D.corr)

# Convert to matrix if not already
ireland.2PL.4D.corr.mat <- as.matrix(ireland.2PL.4D.corr.df)

# Get the lower triangle indices (excluding diagonal)
ireland.2PL.4D.low.tri <- which(lower.tri(ireland.2PL.4D.corr.mat), arr.ind = TRUE)

# Create the long-format dataframe
ireland.2PL.4D.cor.pairs.df <- data.frame(
  vaa = "Ireland",
  Factor1 = rownames(ireland.2PL.4D.corr.mat)[ireland.2PL.4D.low.tri[, 1]],
  Factor2 = colnames(ireland.2PL.4D.corr.mat)[ireland.2PL.4D.low.tri[, 2]],
  Correlation = ireland.2PL.4D.corr.mat[ireland.2PL.4D.low.tri]
)

# Save to Excel file
write_xlsx(ireland.2PL.4D.cor.pairs.df, "files/ireland_irt_inter_correlations.xlsx")

# Extract factor loadings (discrimination parameters) from the 4-dimensional MIRT model
ireland.mirt.loads <- extract.mirt(ireland.2PL.4D, 'F')

# Convert the factor loadings matrix to a data frame for easier manipulation
ireland.mirt.loads.df <- as.data.frame(ireland.mirt.loads)

# Add item codes (original row names) as a new column called 'code'
ireland.mirt.loads.df$code <- rownames(ireland.mirt.loads.df)

# Reshape the data frame from wide to long format for plotting or analysis
# Each row now represents a loading for a specific item and dimension
ireland.mirt.loads.df <- reshape2::melt(ireland.mirt.loads.df)

# Set loadings with an absolute value ≤ 0.3 to NA
# This removes weak loadings for clarity in visualizations or further analysis
ireland.mirt.loads.df[sapply(ireland.mirt.loads.df, is.numeric)] <- lapply(
  ireland.mirt.loads.df[sapply(ireland.mirt.loads.df, is.numeric)],
  function(x) {
    x[abs(x) <= 0.3] <- NA
    return(x)
  }
)

# Merge the long-format factor loading data with an external label dataframe
# 'labels.df' must contain a 'code' column to match on
# This step adds descriptive item labels or metadata to the loadings
ireland.mirt.loads.df <- merge(ireland.mirt.loads.df, labels.df, by = "code")

# Add ideological leaning to statement labels
ireland.mirt.loads.df <- ireland.mirt.loads.df %>%
  mutate(
    new_label = paste0(ideology, " ", new_label)  
  )

# Create a png file to save the heatmap

# The resolution is set to 1200 dpi for high-quality output, and the image size is specified in inches (6x6).
png("figs/ireland_mirt_loadings.png", units="in", width=7, height=6, res=1200)

# Sort the levels of the 'new_label' based on the substring starting from the 5th character
# Ensure there are no duplicate levels by using unique() before assigning the levels
ireland.mirt.loads.df$new_label <- factor(ireland.mirt.loads.df$new_label, 
                                             levels = unique(ireland.mirt.loads.df$new_label[order(substr(ireland.mirt.loads.df$new_label, 5, nchar(ireland.mirt.loads.df$new_label)))]))

# Create the heatmap using ggplot2
# The 'x' axis represents the factors, 'y' represents the ids (statements),
# and 'fill' represents the factor loading values.
ggplot(ireland.mirt.loads.df, aes(x = variable, y = new_label, fill = value)) + 
  geom_tile() +  # Create a tile-based heatmap
  geom_text(aes(label = ifelse(is.na(value), "", sprintf("%.2f", value))), size=2.5) + 
  scale_fill_gradient2(midpoint = 0, low = "#FFE338", high = "#FF0000", mid = "white", 
                       na.value = "lightgrey") +  # Color scale from yellow to red, with NA values in light grey
  labs(fill = "Loading") +  # Label for the color scale legend
  theme_minimal() +  # Apply a minimal theme for the plot
  scale_y_discrete(labels = function(y) {
    # Add color formatting
    y <- gsub("ECONOMY(?=\\.)", "<span style='color:#800000;'><b>ECONOMY</b></span>", y, perl = TRUE)  # Add color for ECFL
    y <- gsub("EDUCATION(?=\\.)", "<span style='color:#9A6324;'><b>EDUCATION</b></span>", y, perl = TRUE)  # Add color for EDUC
    y <- gsub("ENVIRONMENT(?=\\.)", "<span style='color:#3cb44b;'><b>ENVIRONMENT</b></span>", y, perl = TRUE)  # Add color for ENEG
    y <- gsub("ETHICS(?=\\.)", "<span style='color:#f58231;'><b>ETHICS</b></span>", y, perl = TRUE)  # Add color for ETHE
    y <- gsub("FOREIGN(?=\\.)", "<span style='color:#ffe119;'><b>FOREIGN</b></span>", y, perl = TRUE)  # Add color for FAED
    y <- gsub("WELFARE(?=\\.)", "<span style='color:#bfef45;'><b>WELFARE</b></span>", y, perl = TRUE)  # Add color for HSWF
    y <- gsub("IMMIGRATION(?=\\.)", "<span style='color:#e6194B;'><b>IMMIGRATION</b></span>", y, perl = TRUE)  # Add color for IMIN
    y <- gsub("CRIME(?=\\.)", "<span style='color:#42d4f4;'><b>CRIME</b></span>", y, perl = TRUE)  # Add color for JLEF
    y <- gsub("CULTURE(?=\\.)", "<span style='color:#f032e6;'><b>CULTURE</b></span>", y, perl = TRUE)  # Add color for MECU
    y <- gsub("MOBILITY(?=\\.)", "<span style='color:#4363d8;'><b>MOBILITY</b></span>", y, perl = TRUE)  # Add color for MPTR
    y <- gsub("HOUSING(?=\\.)", "<span style='color:#000075;'><b>HOUSING</b></span>", y, perl = TRUE)  # Add color for SPHO
    y <- gsub("STATE(?=\\.)", "<span style='color:#911eb4;'><b>STATE</b></span>", y, perl = TRUE)  # Add color for SRPI
    return(y)
  }) +
  theme(axis.text.y = element_markdown(),
        axis.text.x = element_text(hjust = .5),  # Center the x-axis labels
        axis.title = element_blank(),  # Remove axis titles
        legend.title = element_text(size = 9, margin = margin(r = 0.3, t = -1.8, unit = "lines")),  # Customize the legend title text size and margins
        legend.position = "bottom",  # Place the legend at the bottom
        legend.direction = "horizontal",  # Set the legend items to be horizontal
        legend.key.height = unit(0.4, "cm"), # Reduce the height of the legend scale
        legend.key.width = unit(.8, "cm")) # Increase the width of the legend scale

dev.off()  # Close the png file and save the plot

# Select specific columns (code, variable, value, and new_label) from the 'ireland.efa.loads.df' dataframe
ireland.efa.loads.df <- ireland.efa.loads.df %>%
  dplyr::select(code, variable, value, new_label)

# Select specific columns (code, variable, value, and new_label) from the 'ireland.mirt.loads.df' dataframe
ireland.mirt.loads.df <- ireland.mirt.loads.df %>%
  dplyr::select(code, variable, value, new_label)

# Add a new column 'method' to the 'ireland.efa.loads.df' dataframe and assign the value "(a) Exploratory Factor Analysis" to it
ireland.efa.loads.df$method <- "(a) Exploratory Factor Analysis"

# Add a new column 'method' to the 'ireland.mirt.loads.df' dataframe and assign the value "(b) Item Response Theory" to it
ireland.mirt.loads.df$method <- "(b) Item Response Theory"

# Combine the two dataframes ('ireland.efa.loads.df' and 'ireland.mirt.loads.df') into one dataframe ('ireland.efa.loads.merged.df') by row-binding them
ireland.efa.loads.merged.df <- rbind(ireland.efa.loads.df, ireland.mirt.loads.df)

# Create a png file to save the heatmap

# The resolution is set to 1200 dpi for high-quality output, and the image size is specified in inches (6x6).
png("figs/ireland_loadings_combined.png", units="in", width=12, height=7, res=1200)

# Create the heatmap using ggplot2
# The 'x' axis represents the factors, 'y' represents the ids (statements),
# and 'fill' represents the factor loading values.
ggplot(ireland.efa.loads.merged.df, aes(x = variable, y = new_label, fill = value)) + 
  geom_tile() +  # Create a tile-based heatmap
  geom_text(aes(label = ifelse(is.na(value), "", sprintf("%.2f", value))), size=3.5) + 
  scale_fill_gradient2(midpoint = 0, low = "#FFE338", high = "#FF0000", mid = "white", 
                       na.value = "lightgrey") +  # Color scale from yellow to red, with NA values in light grey
  labs(fill = "Loading") +  # Label for the color scale legend
  theme_minimal() +  # Apply a minimal theme for the plot
  facet_wrap(~ method) +
  scale_y_discrete(labels = function(y) {
    # Add color formatting
    y <- gsub("ECONOMY(?=\\.)", "<span style='color:#800000;'><b>ECONOMY</b></span>", y, perl = TRUE)  # Add color for ECFL
    y <- gsub("EDUCATION(?=\\.)", "<span style='color:#9A6324;'><b>EDUCATION</b></span>", y, perl = TRUE)  # Add color for EDUC
    y <- gsub("ENVIRONMENT(?=\\.)", "<span style='color:#3cb44b;'><b>ENVIRONMENT</b></span>", y, perl = TRUE)  # Add color for ENEG
    y <- gsub("ETHICS(?=\\.)", "<span style='color:#f58231;'><b>ETHICS</b></span>", y, perl = TRUE)  # Add color for ETHE
    y <- gsub("FOREIGN(?=\\.)", "<span style='color:#ffe119;'><b>FOREIGN</b></span>", y, perl = TRUE)  # Add color for FAED
    y <- gsub("WELFARE(?=\\.)", "<span style='color:#bfef45;'><b>WELFARE</b></span>", y, perl = TRUE)  # Add color for HSWF
    y <- gsub("IMMIGRATION(?=\\.)", "<span style='color:#e6194B;'><b>IMMIGRATION</b></span>", y, perl = TRUE)  # Add color for IMIN
    y <- gsub("CRIME(?=\\.)", "<span style='color:#42d4f4;'><b>CRIME</b></span>", y, perl = TRUE)  # Add color for JLEF
    y <- gsub("CULTURE(?=\\.)", "<span style='color:#f032e6;'><b>CULTURE</b></span>", y, perl = TRUE)  # Add color for MECU
    y <- gsub("MOBILITY(?=\\.)", "<span style='color:#4363d8;'><b>MOBILITY</b></span>", y, perl = TRUE)  # Add color for MPTR
    y <- gsub("HOUSING(?=\\.)", "<span style='color:#000075;'><b>HOUSING</b></span>", y, perl = TRUE)  # Add color for SPHO
    y <- gsub("STATE(?=\\.)", "<span style='color:#911eb4;'><b>STATE</b></span>", y, perl = TRUE)  # Add color for SRPI
    return(y)
  }) +
  theme(axis.text.y = element_markdown(size=12),
        axis.text.x = element_text(size=12, hjust = .5),  # Center the x-axis labels
        axis.title = element_blank(),  # Remove axis titles
        legend.title = element_text(size = 12, margin = margin(r = 0.3, t = -1.8, unit = "lines")),  # Customize the legend title text size and margins
        legend.text = element_text(size = 12),
        legend.position = "bottom",  # Place the legend at the bottom
        legend.direction = "horizontal",  # Set the legend items to be horizontal
        legend.key.height = unit(0.4, "cm"), # Reduce the height of the legend scale
        legend.key.width = unit(.8, "cm"), # Increase the width of the legend scale
        strip.text = element_text(size = 14))  # Adjust the facet title size here

dev.off()  # Close the png file and save the plot

################################################################################
## CHECK IRT MODELS IMPROVEMENTS
################################################################################

set.seed(1234)  # For reproducibility

# -------------------------
# 1. Split data
# -------------------------

n <- nrow(votesmart.ireland.df.sample)
ireland.train.index <- sample(seq_len(n), size = floor(0.8 * n))

ireland.train.data <- votesmart.ireland.df.sample[ireland.train.index, ]
ireland.test.data  <- votesmart.ireland.df.sample[-ireland.train.index, ]

# -------------------------
# 2. Prepare test data
# -------------------------

ireland.test.items <- ireland.test.data[, !names(ireland.test.data) %in% "person_weights"]
ireland.test.weights <- ireland.test.data$person_weights

# -------------------------
# 3. Fit 2PL models on test data
# -------------------------

ireland.2PL.1D.test <- mirt(ireland.test.items, model = 1, itemtype = "2PL",
                             weights = ireland.test.weights, verbose = FALSE)

ireland.2PL.2D.test <- mirt(ireland.test.items, model = 2, itemtype = "2PL",
                             weights = ireland.test.weights, verbose = FALSE)

ireland.2PL.3D.test <- mirt(ireland.test.items, model = 3, itemtype = "2PL",
                             weights = ireland.test.weights, verbose = FALSE)

ireland.2PL.4D.test <- mirt(ireland.test.items, model = 4, itemtype = "2PL",
                             weights = ireland.test.weights, verbose = FALSE)

# -------------------------
# 4. Define safe extraction functions
# -------------------------

# Safe log-likelihood extraction
safe_logLik <- function(model) {
  out <- tryCatch(logLik(model), error = function(e) NA_real_)
  if (length(out) == 0) return(NA_real_) else return(as.numeric(out))
}

# Extract BIC directly from the internal slot
safe_BIC <- function(model) {
  out <- tryCatch(model@Fit$BIC, error = function(e) NA_real_)
  if (length(out) == 0) return(NA_real_) else return(as.numeric(out))
}

# -------------------------
# 5. Extract logLik & BIC for each model
# -------------------------

ireland.logliks <- c(
  safe_logLik(ireland.2PL.1D.test),
  safe_logLik(ireland.2PL.2D.test),
  safe_logLik(ireland.2PL.3D.test),
  safe_logLik(ireland.2PL.4D.test)
)

ireland.bics <- c(
  safe_BIC(ireland.2PL.1D.test),
  safe_BIC(ireland.2PL.2D.test),
  safe_BIC(ireland.2PL.3D.test),
  safe_BIC(ireland.2PL.4D.test)
)

# -------------------------
# 6. Create data.frame with model fit stats
# -------------------------

ireland.irt.fit.stats <- data.frame(
  Model = c("2PL_1D", "2PL_2D", "2PL_3D", "2PL_4D"),
  Dimension = 1:4,
  LogLikelihood = ireland.logliks,
  BIC = ireland.bics
)

# -------------------------
# 7. Fit structureless baseline model (intercept-only GLMs)
# -------------------------

baseline_models <- lapply(ireland.test.items, function(item) {
  glm(item ~ 1, family = binomial(link = "logit"))
})

baseline_logLik <- sum(sapply(baseline_models, function(m) as.numeric(logLik(m))))
baseline_BIC <- sum(sapply(baseline_models, BIC))

# Add baseline row
ireland.irt.fit.stats <- rbind(
  data.frame(
    Model = "NoStructure_Baseline",
    Dimension = 0,
    LogLikelihood = baseline_logLik,
    BIC = baseline_BIC
  ),
  ireland.irt.fit.stats
)

# -------------------------
# 8. Compute improvements
# -------------------------

# Sort by number of dimensions
ireland.irt.fit.stats <- ireland.irt.fit.stats[order(ireland.irt.fit.stats$Dimension), ]

# Differences vs. null (absolute improvement)
ireland.irt.fit.stats$LogLik_vs_Null <- ireland.irt.fit.stats$LogLikelihood - baseline_logLik
ireland.irt.fit.stats$BIC_vs_Null <- ireland.irt.fit.stats$BIC - baseline_BIC

# Differences vs. previous model (incremental improvement)
ireland.irt.fit.stats$LogLik_vs_Prev <- c(NA, diff(ireland.irt.fit.stats$LogLikelihood))
ireland.irt.fit.stats$BIC_vs_Prev <- c(NA, diff(ireland.irt.fit.stats$BIC))

# -------------------------
# 9. Display results and diagnostics
# -------------------------

print(ireland.irt.fit.stats)
print(xtable(ireland.irt.fit.stats), type = "latex", include.rownames = FALSE)

# Check which models were extracted successfully
model_status <- data.frame(
  Model = ireland.irt.fit.stats$Model,
  LogLik_OK = !is.na(ireland.irt.fit.stats$LogLikelihood),
  BIC_OK = !is.na(ireland.irt.fit.stats$BIC)
)

print(model_status)